{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import sklearn.datasets\n",
    "import re\n",
    "import time\n",
    "import tensorflow as tf\n",
    "from bayes_opt import BayesianOptimization\n",
    "import pickle\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    string = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = ' '.join(string)\n",
    "    return string\n",
    "\n",
    "# sklearn.datasets.load_files reads each file as a single document,\n",
    "# so we split every document on newlines to get one sample per line\n",
    "def separate_dataset(trainset):\n",
    "    \"\"\"Explode every document of `trainset` into its non-empty lines.\n",
    "\n",
    "    Each kept line is cleaned with `clearstring` and paired with the target of\n",
    "    the document it came from. Returns `(datastring, datatarget)` as two lists.\n",
    "    \"\"\"\n",
    "    datastring, datatarget = [], []\n",
    "    for doc_index, document in enumerate(trainset.data):\n",
    "        # Empty lines are dropped before cleaning.\n",
    "        cleaned_lines = [clearstring(line) for line in document.split('\\n') if line]\n",
    "        datastring.extend(cleaned_lines)\n",
    "        # One copy of the document's label per kept line.\n",
    "        datatarget.extend(trainset.target[doc_index] for _ in cleaned_lines)\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the corpus from the 'data' folder, decoding each file as UTF-8\n",
    "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "# Replace the per-file documents and targets with the per-line samples built by separate_dataset\n",
    "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the pickled lookup that later cells index by token (dict_emotion[text])\n",
    "# NOTE(review): pickle.load can run arbitrary code, so only open a trusted dictionary_emotion.p\n",
    "with open('dictionary_emotion.p', 'rb') as fopen:\n",
    "    dict_emotion = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# test_size = 0.1 keeps 10% of our dataset for the bayesian optimization runs\n",
    "_, opt_X, _, opt_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.1)\n",
    "# that subset is split again: 80% to fit each candidate network, 20% to score it\n",
    "train_opt_X, test_opt_X, train_opt_Y, test_opt_Y = train_test_split(opt_X, opt_Y, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class neuralnet:\n",
    "    \"\"\"Fully connected ReLU classifier built with the TF1 graph API.\n",
    "\n",
    "    Exposes the placeholders `X` and `Y`, the logits `last_l`, and the\n",
    "    `cost`, `optimizer` and `accuracy` ops.\n",
    "    \"\"\"\n",
    "    def __init__(self, timestamp, num_hidden, size_layer, learning_rate = 0.01, num_classes = None):\n",
    "        \"\"\"Build the graph in the current default graph.\n",
    "\n",
    "        timestamp: width of the input vector.\n",
    "        num_hidden: number of hidden layers, at least 1.\n",
    "        size_layer: number of units in every hidden layer.\n",
    "        learning_rate: step size handed to AdamOptimizer.\n",
    "        num_classes: width of the output layer; when None it falls back to\n",
    "            len(trainset_data.target_names), as the class always did.\n",
    "\n",
    "        Raises ValueError when num_hidden is smaller than 1.\n",
    "        \"\"\"\n",
    "        if num_hidden < 1:\n",
    "            raise ValueError('num_hidden must be at least 1, got %r' % (num_hidden,))\n",
    "        if num_classes is None:\n",
    "            num_classes = len(trainset_data.target_names)\n",
    "        def activate(first_layer, second_layer, bias):\n",
    "            return tf.nn.relu(tf.matmul(first_layer, second_layer) + bias)\n",
    "        self.X = tf.placeholder(tf.float32, (None, timestamp))\n",
    "        self.Y = tf.placeholder(tf.float32, (None, num_classes))\n",
    "        input_layer = tf.Variable(tf.random_normal([timestamp, size_layer]))\n",
    "        biased_layer = tf.Variable(tf.random_normal([size_layer], stddev = 0.1))\n",
    "        output_layer = tf.Variable(tf.random_normal([size_layer, num_classes]))\n",
    "        biased_output = tf.Variable(tf.random_normal([num_classes], stddev = 0.1))\n",
    "        layers, biased = [], []\n",
    "        for _ in range(num_hidden - 1):\n",
    "            layers.append(tf.Variable(tf.random_normal([size_layer, size_layer])))\n",
    "            biased.append(tf.Variable(tf.random_normal([size_layer])))\n",
    "        # Chain every hidden layer in one loop. Indexing layers[0] unconditionally\n",
    "        # raised IndexError when num_hidden == 1 (no extra hidden weights exist).\n",
    "        next_l = activate(self.X, input_layer, biased_layer)\n",
    "        for weight, bias in zip(layers, biased):\n",
    "            next_l = activate(next_l, weight, bias)\n",
    "        self.last_l = tf.matmul(next_l, output_layer) + biased_output\n",
    "        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.last_l, \n",
    "                                                                           labels = self.Y))\n",
    "        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)\n",
    "        correct_prediction = tf.equal(tf.argmax(self.last_l, 1), tf.argmax(self.Y, 1))\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _encode_texts(texts, labels, timestamp):\n",
    "    \"\"\"Vectorize sentences for the network.\n",
    "\n",
    "    Returns `(features, onehot)`: `features` has one row per sentence holding the\n",
    "    `dict_emotion` value of its first `timestamp` tokens, right-aligned and then\n",
    "    standardized per row; `onehot` is the one-hot encoding of `labels`.\n",
    "    \"\"\"\n",
    "    features = np.zeros((len(texts), timestamp))\n",
    "    onehot = np.zeros((len(texts), len(trainset_data.target_names)))\n",
    "    for row, sentence in enumerate(texts):\n",
    "        tokens = sentence.split()[:timestamp]\n",
    "        for no, text in enumerate(tokens[::-1]):\n",
    "            try:\n",
    "                features[row, -1 - no] = dict_emotion[text]\n",
    "            except KeyError:\n",
    "                # Token missing from dict_emotion: leave its slot at zero.\n",
    "                continue\n",
    "        onehot[row, labels[row]] = 1.0\n",
    "    # The transposes make StandardScaler normalize each sentence (row) on its own.\n",
    "    return StandardScaler().fit_transform(features.T).T, onehot\n",
    "\n",
    "def neural_network(timestamp, num_hidden, size_layer, learning_rate = 0.001, batch_size = 200, epoch = 20):\n",
    "    \"\"\"Train a `neuralnet` on the optimization split and report averaged metrics.\n",
    "\n",
    "    Trains for `epoch` passes over `train_opt_X` in full batches of `batch_size`\n",
    "    (a trailing partial batch is dropped) and evaluates on `test_opt_X` after\n",
    "    every pass.\n",
    "\n",
    "    Returns `(COST, TEST_COST, ACC, TEST_ACC)`: train cost, validation cost,\n",
    "    train accuracy and validation accuracy, each averaged over all epochs.\n",
    "\n",
    "    Raises ValueError when `train_opt_X` holds fewer than `batch_size` samples.\n",
    "    \"\"\"\n",
    "    num_batches = len(train_opt_X) // batch_size\n",
    "    if num_batches == 0:\n",
    "        # Without this guard the averages below divide by zero.\n",
    "        raise ValueError('need at least batch_size (%d) training samples, got %d' % (batch_size, len(train_opt_X)))\n",
    "    tf.reset_default_graph()\n",
    "    model = neuralnet(timestamp, num_hidden, size_layer, learning_rate)\n",
    "    # Nothing is shuffled between epochs, so every batch is encoded once up front\n",
    "    # instead of being rebuilt inside the epoch loop.\n",
    "    train_batches = [_encode_texts(train_opt_X[n: n + batch_size], train_opt_Y[n: n + batch_size], timestamp)\n",
    "                     for n in range(0, num_batches * batch_size, batch_size)]\n",
    "    valid_x, valid_y = _encode_texts(test_opt_X, test_opt_Y, timestamp)\n",
    "    valid_feed = {model.X: valid_x, model.Y: valid_y}\n",
    "    COST, TEST_COST, ACC, TEST_ACC = [], [], [], []\n",
    "    # The search calls this function once per candidate; a context-managed session\n",
    "    # is closed on exit, whereas the InteractiveSession used before was never released.\n",
    "    with tf.Session() as sess:\n",
    "        sess.run(tf.global_variables_initializer())\n",
    "        for i in range(epoch):\n",
    "            train_acc, train_loss = 0, 0\n",
    "            for batch_x, batch_y in train_batches:\n",
    "                feed = {model.X: batch_x, model.Y: batch_y}\n",
    "                _, loss = sess.run([model.optimizer, model.cost], feed_dict = feed)\n",
    "                # Accuracy is measured after the update step, as before.\n",
    "                train_acc += sess.run(model.accuracy, feed_dict = feed)\n",
    "                train_loss += loss\n",
    "            TEST_COST.append(sess.run(model.cost, feed_dict = valid_feed))\n",
    "            TEST_ACC.append(sess.run(model.accuracy, feed_dict = valid_feed))\n",
    "            ACC.append(train_acc / num_batches)\n",
    "            COST.append(train_loss / num_batches)\n",
    "    COST = np.array(COST).mean()\n",
    "    TEST_COST = np.array(TEST_COST).mean()\n",
    "    ACC = np.array(ACC).mean()\n",
    "    TEST_ACC = np.array(TEST_ACC).mean()\n",
    "    return COST, TEST_COST, ACC, TEST_ACC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generate_nn(timestamp, num_hidden, size_layer):\n",
    "    \"\"\"Objective function handed to BayesianOptimization.\n",
    "\n",
    "    Rounds the three continuous suggestions to integers, trains a network with\n",
    "    `neural_network`, logs the parameters to `log_file`, and returns the\n",
    "    validation accuracy. The global `accbest` tracks the best accuracy seen.\n",
    "    \"\"\"\n",
    "    global accbest\n",
    "    param = {\n",
    "        'timestamp': int(np.around(timestamp)),\n",
    "        'num_hidden' : int(np.around(num_hidden)),\n",
    "        'size_layer' : int(np.around(size_layer))\n",
    "    }\n",
    "    print(\"\\nSearch parameters %s\" % (param), file = log_file)\n",
    "    log_file.flush()\n",
    "    learning_cost, valid_cost, learning_acc, valid_acc = neural_network(**param)\n",
    "    # NOTE(review): the message says 100 iterations, but neural_network is called with\n",
    "    # its default epoch = 20; the text is kept so new logs stay comparable with old ones.\n",
    "    print(\"stop after 100 iteration with train cost %f, valid cost %f, train acc %f, valid acc %f\" % (learning_cost, valid_cost, learning_acc, valid_acc))\n",
    "    if valid_acc > accbest:\n",
    "        # This used to assign a stray local `costbest`, so accbest never changed.\n",
    "        accbest = valid_acc\n",
    "    return valid_acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mInitialization\u001b[0m\n",
      "\u001b[94m-------------------------------------------------------------------------\u001b[0m\n",
      " Step |   Time |      Value |   num_hidden |   size_layer |   timestamp | \n",
      "stop after 100 iteration with train cost 353915.786809, valid cost 554578.562500, train acc 0.438030, valid acc 0.242030\n",
      "    1 | 00m22s | \u001b[35m   0.24203\u001b[0m | \u001b[32m      4.6905\u001b[0m | \u001b[32m    678.6922\u001b[0m | \u001b[32m    10.5156\u001b[0m | \n",
      "stop after 100 iteration with train cost 1975761858927.653076, valid cost 2002158747648.000000, train acc 0.438173, valid acc 0.236212\n",
      "    2 | 00m33s |    0.23621 |      11.2681 |     564.6557 |     20.1093 | \n",
      "stop after 100 iteration with train cost 23687387.463253, valid cost 47984616.000000, train acc 0.563673, valid acc 0.228685\n",
      "    3 | 00m37s |    0.22869 |       6.1126 |     986.5880 |     25.1476 | \n",
      "stop after 100 iteration with train cost nan, valid cost nan, train acc 0.138452, valid acc 0.129423\n",
      "    4 | 00m36s |    0.12942 |      18.8414 |     414.5340 |     18.1491 | \n",
      "stop after 100 iteration with train cost nan, valid cost nan, train acc 0.138462, valid acc 0.129423\n",
      "    5 | 01m25s |    0.12942 |      15.1939 |     993.1457 |     26.9549 | \n",
      "stop after 100 iteration with train cost 94852.398190, valid cost 117166.414062, train acc 0.372890, valid acc 0.238227\n",
      "    6 | 00m15s |    0.23823 |       4.6860 |     347.8158 |     23.0226 | \n",
      "stop after 100 iteration with train cost 5333959603.074697, valid cost 9774700544.000000, train acc 0.581869, valid acc 0.231504\n",
      "    7 | 00m45s |    0.23150 |       8.2570 |     937.0584 |     21.8834 | \n",
      "stop after 100 iteration with train cost 1059941.027784, valid cost 2126626.500000, train acc 0.533685, valid acc 0.237831\n",
      "    8 | 00m31s |    0.23783 |       4.9775 |     921.8299 |     28.9463 | \n",
      "stop after 100 iteration with train cost 157350816.188554, valid cost 132308688.000000, train acc 0.314854, valid acc 0.236098\n",
      "    9 | 00m14s |    0.23610 |       8.7053 |     249.7875 |      8.5820 | \n",
      "stop after 100 iteration with train cost 491581.180349, valid cost 850897.812500, train acc 0.468015, valid acc 0.237861\n",
      "   10 | 00m24s |    0.23786 |       5.1395 |     656.4735 |     38.4245 | \n",
      "\u001b[31mBayesian Optimization\u001b[0m\n",
      "\u001b[94m-------------------------------------------------------------------------\u001b[0m\n",
      " Step |   Time |      Value |   num_hidden |   size_layer |   timestamp | \n",
      "stop after 100 iteration with train cost 5.624642, valid cost 4.227495, train acc 0.317768, valid acc 0.319677\n",
      "   11 | 00m25s | \u001b[35m   0.31968\u001b[0m | \u001b[32m      2.0000\u001b[0m | \u001b[32m     32.0000\u001b[0m | \u001b[32m    50.0000\u001b[0m | \n",
      "stop after 100 iteration with train cost 3.139004, valid cost 2.634644, train acc 0.278649, valid acc 0.285378\n",
      "   12 | 00m13s |    0.28538 |       2.0000 |      32.0000 |      5.0000 | \n",
      "stop after 100 iteration with train cost 45.548224, valid cost 42.098869, train acc 0.257617, valid acc 0.235205\n",
      "   13 | 00m17s |    0.23520 |       2.0000 |     131.5267 |     50.0000 | \n",
      "stop after 100 iteration with train cost 33.290022, valid cost 29.083445, train acc 0.288520, valid acc 0.248393\n",
      "   14 | 00m15s |    0.24839 |       2.0000 |     817.3645 |      5.0000 | \n",
      "stop after 100 iteration with train cost 9073355559.409639, valid cost 1572524288.000000, train acc 0.241178, valid acc 0.265065\n",
      "   15 | 00m20s |    0.26507 |      20.0000 |      32.0000 |     50.0000 | \n",
      "stop after 100 iteration with train cost 107.532668, valid cost 110.002579, train acc 0.278008, valid acc 0.243421\n",
      "   16 | 00m14s |    0.24342 |       2.0000 |     279.9158 |     50.0000 | \n",
      "stop after 100 iteration with train cost 39.990774, valid cost 38.717785, train acc 0.293771, valid acc 0.240134\n",
      "   17 | 00m17s |    0.24013 |       2.0000 |     974.6444 |      5.0000 | \n",
      "stop after 100 iteration with train cost 234.833976, valid cost 309.596191, train acc 0.333083, valid acc 0.247145\n",
      "   18 | 00m17s |    0.24715 |       2.0000 |     761.4187 |     50.0000 | \n",
      "stop after 100 iteration with train cost 304.794607, valid cost 416.649902, train acc 0.347264, valid acc 0.233933\n",
      "   19 | 00m21s |    0.23393 |       2.0335 |     969.3864 |     49.9344 | \n",
      "stop after 100 iteration with train cost 192.076032, valid cost 223.615967, train acc 0.306747, valid acc 0.225363\n",
      "   20 | 00m16s |    0.22536 |       2.0000 |     520.5116 |     50.0000 | \n",
      "stop after 100 iteration with train cost 20.112922, valid cost 17.478102, train acc 0.252631, valid acc 0.236830\n",
      "   21 | 00m13s |    0.23683 |       2.0000 |      59.4493 |     50.0000 | \n",
      "stop after 100 iteration with train cost 11.722375, valid cost 8.438548, train acc 0.258480, valid acc 0.237166\n",
      "   22 | 00m09s |    0.23717 |       2.0000 |     176.6249 |      5.0000 | \n",
      "stop after 100 iteration with train cost 24.746810, valid cost 20.625248, train acc 0.285080, valid acc 0.245172\n",
      "   23 | 00m11s |    0.24517 |       2.0000 |     614.1747 |      5.0000 | \n",
      "stop after 100 iteration with train cost 73.246377, valid cost 72.095230, train acc 0.262175, valid acc 0.235750\n",
      "   24 | 00m12s |    0.23575 |       2.2305 |     207.0692 |     49.4121 | \n",
      "stop after 100 iteration with train cost 31.091234, valid cost 30.837530, train acc 0.283706, valid acc 0.232002\n",
      "   25 | 00m10s |    0.23200 |       2.1675 |     520.5295 |      5.5021 | \n",
      "stop after 100 iteration with train cost 31.164532, valid cost 25.463245, train acc 0.287310, valid acc 0.256687\n",
      "   26 | 00m12s |    0.25669 |       2.0446 |     746.9381 |      5.3940 | \n",
      "stop after 100 iteration with train cost 12.452973, valid cost 10.689310, train acc 0.273009, valid acc 0.251205\n",
      "   27 | 00m08s |    0.25121 |       2.0000 |     301.2809 |      5.0000 | \n",
      "stop after 100 iteration with train cost 182.155289, valid cost 235.371750, train acc 0.318655, valid acc 0.227222\n",
      "   28 | 00m14s |    0.22722 |       2.2118 |     588.0558 |     48.5127 | \n",
      "stop after 100 iteration with train cost 17.873599, valid cost 13.606679, train acc 0.266471, valid acc 0.270433\n",
      "   29 | 00m11s |    0.27043 |       2.7004 |      32.3658 |     33.8489 | \n",
      "stop after 100 iteration with train cost 274.785133, valid cost 370.257843, train acc 0.337462, valid acc 0.231492\n",
      "   30 | 00m17s |    0.23149 |       2.1319 |     841.1241 |     49.3260 | \n"
     ]
    }
   ],
   "source": [
    "# Best validation accuracy seen so far; updated by generate_nn\n",
    "accbest = 0.0\n",
    "# Search ranges: generate_nn rounds each suggestion to an integer before training\n",
    "NN_BAYESIAN = BayesianOptimization(generate_nn, \n",
    "                              {'timestamp': (5, 50),\n",
    "                               'num_hidden': (2, 20),\n",
    "                               'size_layer': (32, 1024),\n",
    "                              })\n",
    "# generate_nn writes to the notebook-level name `log_file`; the with-block keeps the\n",
    "# file open for the whole search and closes it afterwards (it was never closed before)\n",
    "with open('nn-bayesian.log', 'a') as log_file:\n",
    "    NN_BAYESIAN.maximize(init_points = 10, n_iter = 20, acq = 'ei', xi = 0.0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
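    "Best result of the Bayesian search (step 11): `num_hidden = 2`, `size_layer = 32`, `timestamp = 50`, with a validation accuracy of 0.319677.\n",
    "\n",
    "```\n",
    "stop after 100 iteration with train cost 5.624642, valid cost 4.227495, train acc 0.317768, valid acc 0.319677\n",
    "   11 | 00m25s |    0.31968 |       2.0000 |      32.0000 |     50.0000 | \n",
    "```"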
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of token slots per sentence (the timestamp value of the best search result)\n",
    "SEQUENCE_LENGTH = 50\n",
    "data_X = np.zeros((len(trainset_data.data), SEQUENCE_LENGTH))\n",
    "for i in range(data_X.shape[0]):\n",
    "    tokens = trainset_data.data[i].split()[:SEQUENCE_LENGTH]\n",
    "    # Right-align the sentence: its last kept token lands in the last column\n",
    "    for no, text in enumerate(tokens[::-1]):\n",
    "        try:\n",
    "            data_X[i, -1 - no] = dict_emotion[text]\n",
    "        except KeyError:\n",
    "            # Token missing from dict_emotion: leave its slot at zero.\n",
    "            # (A bare except here would also hide unrelated errors.)\n",
    "            continue\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(data_X, trainset_data.target, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0 , pass acc: 0 , current acc: 0.305403\n",
      "epoch: 1 , pass acc: 0.305403 , current acc: 0.335717\n",
      "epoch: 2 , pass acc: 0.335717 , current acc: 0.343142\n",
      "epoch: 3 , pass acc: 0.343142 , current acc: 0.345385\n",
      "epoch: 4 , pass acc: 0.345385 , current acc: 0.345745\n",
      "epoch: 5 , pass acc: 0.345745 , current acc: 0.346273\n",
      "epoch: 8 , pass acc: 0.346273 , current acc: 0.347029\n",
      "break epoch: 107\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "tf.reset_default_graph()\n",
    "# NOTE(review): this uses the class default learning_rate (0.01), while the search\n",
    "# trained its candidates with neural_network's default of 0.001 - confirm which is intended\n",
    "model = neuralnet(50, 2, 32)\n",
    "sess = tf.InteractiveSession()\n",
    "sess.run(tf.global_variables_initializer())\n",
    "saver = tf.train.Saver(tf.global_variables())\n",
    "# EARLY_STOPPING is the patience: training stops after that many consecutive\n",
    "# epochs without a new best validation accuracy\n",
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 100, 0, 0, 0\n",
    "batch_size = 200\n",
    "num_batches = train_X.shape[0] // batch_size\n",
    "# The validation set never changes, so encode and scale it once instead of every epoch\n",
    "valid_y = np.zeros((test_X.shape[0], len(trainset_data.target_names)))\n",
    "for k in range(test_X.shape[0]):\n",
    "    valid_y[k, test_Y[k]] = 1.0\n",
    "valid_x = StandardScaler().fit_transform(test_X.T).T\n",
    "while True:\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:', EPOCH)\n",
    "        break\n",
    "    train_acc, train_loss = 0, 0\n",
    "    for n in range(0, num_batches * batch_size, batch_size):\n",
    "        batch_x = train_X[n: n+batch_size, :]\n",
    "        batch_y = np.zeros((batch_size, len(trainset_data.target_names)))\n",
    "        for k in range(batch_size):\n",
    "            batch_y[k, train_Y[n+k]] = 1.0\n",
    "        # The transposes make StandardScaler normalize each sample (row) on its own\n",
    "        batch_x = StandardScaler().fit_transform(batch_x.T).T\n",
    "        _, loss = sess.run([model.optimizer, model.cost], feed_dict = {model.X: batch_x, \n",
    "                                                                        model.Y: batch_y})\n",
    "        train_acc += sess.run(model.accuracy, feed_dict = {model.X: batch_x, model.Y: batch_y})\n",
    "        train_loss += loss\n",
    "    TEST_COST = sess.run(model.cost, feed_dict = {model.X: valid_x, model.Y: valid_y})\n",
    "    TEST_ACC = sess.run(model.accuracy, feed_dict = {model.X: valid_x, model.Y: valid_y})\n",
    "    train_loss /= num_batches\n",
    "    train_acc /= num_batches\n",
    "    if TEST_ACC > CURRENT_ACC:\n",
    "        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', TEST_ACC)\n",
    "        CURRENT_ACC = TEST_ACC\n",
    "        # Restart the patience counter: it was never reset before, so it counted every\n",
    "        # non-improving epoch since the start rather than consecutive ones\n",
    "        CURRENT_CHECKPOINT = 0\n",
    "        saver.save(sess, os.path.join(os.getcwd(), \"model.ckpt\"))\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "    EPOCH += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
